/**
 */
#include "../1880v2_test_util.h"
// Marker macros that expand to nothing; they only annotate pointer
// parameters as outputs (OUT) or inputs (IN) at the declaration site.
#define OUT
#define IN

using namespace std;
//TODO: get from ctx
// Number of channels every lookup table is replicated across.
static u32 channel = 32; //<! 1880v2 hardcode

//<! 1880v2 hw config
// Per-channel lookup-table geometry: table_h x table_w = table_hw entries.
static u32 table_h = 32;
static u32 table_w = 8;
static u32 table_hw = table_h * table_w;

// NOTICE: all intermediate results are saved in double precision.
// Scratch buffer of table_hw doubles: written by gen_sqrt_inv() and read back
// by gen_sqrt_inv_slope(). The allocation is neither checked nor freed.
static double *sqrt_hw = (double *)malloc(sizeof(double) * table_hw);

// Fixed range of power-of-two exponents used when building the tables.
const static int exp_start = -62;
const static int exp_end = 63;

/**
 * pre_data means we test a fixed pattern; it should be the same as the LUT
 */
enum TEST_MODE {
  PRE_DATA_COMPARE_FIX = 0, // pre-data + fix compare
  TEST_MODE_MAX,
};

// Mode of the test currently running; set by main() before each run and read
// by verify() and test_tl_int8_lut_bf16().
static TEST_MODE mode;

// Fixed input pattern for PRE_DATA_COMPARE_FIX mode: 1000 raw 16-bit words
// that test_tl_int8_lut_bf16() copies to the start of the bf16 input tensor.
// The values are non-decreasing, starting at 0x0000 and ending at 0x3DCD.
static u16 test_pattern[] = {
  0x0000,
  0x38D2,
  0x3952,
  0x399D,
  0x39D2,
  0x3A03,
  0x3A1D,
  0x3A38,
  0x3A52,
  0x3A6C,
  0x3A83,
  0x3A90,
  0x3A9D,
  0x3AAA,
  0x3AB8,
  0x3AC5,
  0x3AD2,
  0x3ADF,
  0x3AEC,
  0x3AF9,
  0x3B03,
  0x3B0A,
  0x3B10,
  0x3B17,
  0x3B1D,
  0x3B24,
  0x3B2A,
  0x3B31,
  0x3B38,
  0x3B3E,
  0x3B45,
  0x3B4B,
  0x3B52,
  0x3B58,
  0x3B5F,
  0x3B65,
  0x3B6C,
  0x3B72,
  0x3B79,
  0x3B80,
  0x3B83,
  0x3B86,
  0x3B8A,
  0x3B8D,
  0x3B90,
  0x3B93,
  0x3B97,
  0x3B9A,
  0x3B9D,
  0x3BA1,
  0x3BA4,
  0x3BA7,
  0x3BAA,
  0x3BAE,
  0x3BB1,
  0x3BB4,
  0x3BB8,
  0x3BBB,
  0x3BBE,
  0x3BC1,
  0x3BC5,
  0x3BC8,
  0x3BCB,
  0x3BCE,
  0x3BD2,
  0x3BD5,
  0x3BD8,
  0x3BDC,
  0x3BDF,
  0x3BE2,
  0x3BE5,
  0x3BE9,
  0x3BEC,
  0x3BEF,
  0x3BF2,
  0x3BF6,
  0x3BF9,
  0x3BFC,
  0x3C00,
  0x3C01,
  0x3C03,
  0x3C05,
  0x3C06,
  0x3C08,
  0x3C0A,
  0x3C0B,
  0x3C0D,
  0x3C0F,
  0x3C10,
  0x3C12,
  0x3C13,
  0x3C15,
  0x3C17,
  0x3C18,
  0x3C1A,
  0x3C1C,
  0x3C1D,
  0x3C1F,
  0x3C21,
  0x3C22,
  0x3C24,
  0x3C25,
  0x3C27,
  0x3C29,
  0x3C2A,
  0x3C2C,
  0x3C2E,
  0x3C2F,
  0x3C31,
  0x3C33,
  0x3C34,
  0x3C36,
  0x3C38,
  0x3C39,
  0x3C3B,
  0x3C3C,
  0x3C3E,
  0x3C40,
  0x3C41,
  0x3C43,
  0x3C45,
  0x3C46,
  0x3C48,
  0x3C4A,
  0x3C4B,
  0x3C4D,
  0x3C4E,
  0x3C50,
  0x3C52,
  0x3C53,
  0x3C55,
  0x3C57,
  0x3C58,
  0x3C5A,
  0x3C5C,
  0x3C5D,
  0x3C5F,
  0x3C60,
  0x3C62,
  0x3C64,
  0x3C65,
  0x3C67,
  0x3C69,
  0x3C6A,
  0x3C6C,
  0x3C6E,
  0x3C6F,
  0x3C71,
  0x3C72,
  0x3C74,
  0x3C76,
  0x3C77,
  0x3C79,
  0x3C7B,
  0x3C7C,
  0x3C7E,
  0x3C80,
  0x3C81,
  0x3C81,
  0x3C82,
  0x3C83,
  0x3C84,
  0x3C85,
  0x3C86,
  0x3C86,
  0x3C87,
  0x3C88,
  0x3C89,
  0x3C8A,
  0x3C8A,
  0x3C8B,
  0x3C8C,
  0x3C8D,
  0x3C8E,
  0x3C8F,
  0x3C8F,
  0x3C90,
  0x3C91,
  0x3C92,
  0x3C93,
  0x3C93,
  0x3C94,
  0x3C95,
  0x3C96,
  0x3C97,
  0x3C98,
  0x3C98,
  0x3C99,
  0x3C9A,
  0x3C9B,
  0x3C9C,
  0x3C9C,
  0x3C9D,
  0x3C9E,
  0x3C9F,
  0x3CA0,
  0x3CA1,
  0x3CA1,
  0x3CA2,
  0x3CA3,
  0x3CA4,
  0x3CA5,
  0x3CA5,
  0x3CA6,
  0x3CA7,
  0x3CA8,
  0x3CA9,
  0x3CAA,
  0x3CAA,
  0x3CAB,
  0x3CAC,
  0x3CAD,
  0x3CAE,
  0x3CAE,
  0x3CAF,
  0x3CB0,
  0x3CB1,
  0x3CB2,
  0x3CB3,
  0x3CB3,
  0x3CB4,
  0x3CB5,
  0x3CB6,
  0x3CB7,
  0x3CB8,
  0x3CB8,
  0x3CB9,
  0x3CBA,
  0x3CBB,
  0x3CBC,
  0x3CBC,
  0x3CBD,
  0x3CBE,
  0x3CBF,
  0x3CC0,
  0x3CC1,
  0x3CC1,
  0x3CC2,
  0x3CC3,
  0x3CC4,
  0x3CC5,
  0x3CC5,
  0x3CC6,
  0x3CC7,
  0x3CC8,
  0x3CC9,
  0x3CCA,
  0x3CCA,
  0x3CCB,
  0x3CCC,
  0x3CCD,
  0x3CCE,
  0x3CCE,
  0x3CCF,
  0x3CD0,
  0x3CD1,
  0x3CD2,
  0x3CD3,
  0x3CD3,
  0x3CD4,
  0x3CD5,
  0x3CD6,
  0x3CD7,
  0x3CD7,
  0x3CD8,
  0x3CD9,
  0x3CDA,
  0x3CDB,
  0x3CDC,
  0x3CDC,
  0x3CDD,
  0x3CDE,
  0x3CDF,
  0x3CE0,
  0x3CE0,
  0x3CE1,
  0x3CE2,
  0x3CE3,
  0x3CE4,
  0x3CE5,
  0x3CE5,
  0x3CE6,
  0x3CE7,
  0x3CE8,
  0x3CE9,
  0x3CE9,
  0x3CEA,
  0x3CEB,
  0x3CEC,
  0x3CED,
  0x3CEE,
  0x3CEE,
  0x3CEF,
  0x3CF0,
  0x3CF1,
  0x3CF2,
  0x3CF2,
  0x3CF3,
  0x3CF4,
  0x3CF5,
  0x3CF6,
  0x3CF7,
  0x3CF7,
  0x3CF8,
  0x3CF9,
  0x3CFA,
  0x3CFB,
  0x3CFB,
  0x3CFC,
  0x3CFD,
  0x3CFE,
  0x3CFF,
  0x3D00,
  0x3D00,
  0x3D01,
  0x3D01,
  0x3D01,
  0x3D02,
  0x3D02,
  0x3D03,
  0x3D03,
  0x3D03,
  0x3D04,
  0x3D04,
  0x3D05,
  0x3D05,
  0x3D06,
  0x3D06,
  0x3D06,
  0x3D07,
  0x3D07,
  0x3D08,
  0x3D08,
  0x3D08,
  0x3D09,
  0x3D09,
  0x3D0A,
  0x3D0A,
  0x3D0A,
  0x3D0B,
  0x3D0B,
  0x3D0C,
  0x3D0C,
  0x3D0C,
  0x3D0D,
  0x3D0D,
  0x3D0E,
  0x3D0E,
  0x3D0F,
  0x3D0F,
  0x3D0F,
  0x3D10,
  0x3D10,
  0x3D11,
  0x3D11,
  0x3D11,
  0x3D12,
  0x3D12,
  0x3D13,
  0x3D13,
  0x3D13,
  0x3D14,
  0x3D14,
  0x3D15,
  0x3D15,
  0x3D16,
  0x3D16,
  0x3D16,
  0x3D17,
  0x3D17,
  0x3D18,
  0x3D18,
  0x3D18,
  0x3D19,
  0x3D19,
  0x3D1A,
  0x3D1A,
  0x3D1A,
  0x3D1B,
  0x3D1B,
  0x3D1C,
  0x3D1C,
  0x3D1C,
  0x3D1D,
  0x3D1D,
  0x3D1E,
  0x3D1E,
  0x3D1F,
  0x3D1F,
  0x3D1F,
  0x3D20,
  0x3D20,
  0x3D21,
  0x3D21,
  0x3D21,
  0x3D22,
  0x3D22,
  0x3D23,
  0x3D23,
  0x3D23,
  0x3D24,
  0x3D24,
  0x3D25,
  0x3D25,
  0x3D25,
  0x3D26,
  0x3D26,
  0x3D27,
  0x3D27,
  0x3D28,
  0x3D28,
  0x3D28,
  0x3D29,
  0x3D29,
  0x3D2A,
  0x3D2A,
  0x3D2A,
  0x3D2B,
  0x3D2B,
  0x3D2C,
  0x3D2C,
  0x3D2C,
  0x3D2D,
  0x3D2D,
  0x3D2E,
  0x3D2E,
  0x3D2E,
  0x3D2F,
  0x3D2F,
  0x3D30,
  0x3D30,
  0x3D31,
  0x3D31,
  0x3D31,
  0x3D32,
  0x3D32,
  0x3D33,
  0x3D33,
  0x3D33,
  0x3D34,
  0x3D34,
  0x3D35,
  0x3D35,
  0x3D35,
  0x3D36,
  0x3D36,
  0x3D37,
  0x3D37,
  0x3D38,
  0x3D38,
  0x3D38,
  0x3D39,
  0x3D39,
  0x3D3A,
  0x3D3A,
  0x3D3A,
  0x3D3B,
  0x3D3B,
  0x3D3C,
  0x3D3C,
  0x3D3C,
  0x3D3D,
  0x3D3D,
  0x3D3E,
  0x3D3E,
  0x3D3E,
  0x3D3F,
  0x3D3F,
  0x3D40,
  0x3D40,
  0x3D41,
  0x3D41,
  0x3D41,
  0x3D42,
  0x3D42,
  0x3D43,
  0x3D43,
  0x3D43,
  0x3D44,
  0x3D44,
  0x3D45,
  0x3D45,
  0x3D45,
  0x3D46,
  0x3D46,
  0x3D47,
  0x3D47,
  0x3D47,
  0x3D48,
  0x3D48,
  0x3D49,
  0x3D49,
  0x3D4A,
  0x3D4A,
  0x3D4A,
  0x3D4B,
  0x3D4B,
  0x3D4C,
  0x3D4C,
  0x3D4C,
  0x3D4D,
  0x3D4D,
  0x3D4E,
  0x3D4E,
  0x3D4E,
  0x3D4F,
  0x3D4F,
  0x3D50,
  0x3D50,
  0x3D50,
  0x3D51,
  0x3D51,
  0x3D52,
  0x3D52,
  0x3D53,
  0x3D53,
  0x3D53,
  0x3D54,
  0x3D54,
  0x3D55,
  0x3D55,
  0x3D55,
  0x3D56,
  0x3D56,
  0x3D57,
  0x3D57,
  0x3D57,
  0x3D58,
  0x3D58,
  0x3D59,
  0x3D59,
  0x3D59,
  0x3D5A,
  0x3D5A,
  0x3D5B,
  0x3D5B,
  0x3D5C,
  0x3D5C,
  0x3D5C,
  0x3D5D,
  0x3D5D,
  0x3D5E,
  0x3D5E,
  0x3D5E,
  0x3D5F,
  0x3D5F,
  0x3D60,
  0x3D60,
  0x3D60,
  0x3D61,
  0x3D61,
  0x3D62,
  0x3D62,
  0x3D63,
  0x3D63,
  0x3D63,
  0x3D64,
  0x3D64,
  0x3D65,
  0x3D65,
  0x3D65,
  0x3D66,
  0x3D66,
  0x3D67,
  0x3D67,
  0x3D67,
  0x3D68,
  0x3D68,
  0x3D69,
  0x3D69,
  0x3D69,
  0x3D6A,
  0x3D6A,
  0x3D6B,
  0x3D6B,
  0x3D6C,
  0x3D6C,
  0x3D6C,
  0x3D6D,
  0x3D6D,
  0x3D6E,
  0x3D6E,
  0x3D6E,
  0x3D6F,
  0x3D6F,
  0x3D70,
  0x3D70,
  0x3D70,
  0x3D71,
  0x3D71,
  0x3D72,
  0x3D72,
  0x3D72,
  0x3D73,
  0x3D73,
  0x3D74,
  0x3D74,
  0x3D75,
  0x3D75,
  0x3D75,
  0x3D76,
  0x3D76,
  0x3D77,
  0x3D77,
  0x3D77,
  0x3D78,
  0x3D78,
  0x3D79,
  0x3D79,
  0x3D79,
  0x3D7A,
  0x3D7A,
  0x3D7B,
  0x3D7B,
  0x3D7B,
  0x3D7C,
  0x3D7C,
  0x3D7D,
  0x3D7D,
  0x3D7E,
  0x3D7E,
  0x3D7E,
  0x3D7F,
  0x3D7F,
  0x3D80,
  0x3D80,
  0x3D80,
  0x3D80,
  0x3D81,
  0x3D81,
  0x3D81,
  0x3D81,
  0x3D81,
  0x3D82,
  0x3D82,
  0x3D82,
  0x3D82,
  0x3D82,
  0x3D83,
  0x3D83,
  0x3D83,
  0x3D83,
  0x3D83,
  0x3D84,
  0x3D84,
  0x3D84,
  0x3D84,
  0x3D85,
  0x3D85,
  0x3D85,
  0x3D85,
  0x3D85,
  0x3D86,
  0x3D86,
  0x3D86,
  0x3D86,
  0x3D86,
  0x3D87,
  0x3D87,
  0x3D87,
  0x3D87,
  0x3D87,
  0x3D88,
  0x3D88,
  0x3D88,
  0x3D88,
  0x3D88,
  0x3D89,
  0x3D89,
  0x3D89,
  0x3D89,
  0x3D89,
  0x3D8A,
  0x3D8A,
  0x3D8A,
  0x3D8A,
  0x3D8A,
  0x3D8B,
  0x3D8B,
  0x3D8B,
  0x3D8B,
  0x3D8B,
  0x3D8C,
  0x3D8C,
  0x3D8C,
  0x3D8C,
  0x3D8C,
  0x3D8D,
  0x3D8D,
  0x3D8D,
  0x3D8D,
  0x3D8E,
  0x3D8E,
  0x3D8E,
  0x3D8E,
  0x3D8E,
  0x3D8F,
  0x3D8F,
  0x3D8F,
  0x3D8F,
  0x3D8F,
  0x3D90,
  0x3D90,
  0x3D90,
  0x3D90,
  0x3D90,
  0x3D91,
  0x3D91,
  0x3D91,
  0x3D91,
  0x3D91,
  0x3D92,
  0x3D92,
  0x3D92,
  0x3D92,
  0x3D92,
  0x3D93,
  0x3D93,
  0x3D93,
  0x3D93,
  0x3D93,
  0x3D94,
  0x3D94,
  0x3D94,
  0x3D94,
  0x3D94,
  0x3D95,
  0x3D95,
  0x3D95,
  0x3D95,
  0x3D96,
  0x3D96,
  0x3D96,
  0x3D96,
  0x3D96,
  0x3D97,
  0x3D97,
  0x3D97,
  0x3D97,
  0x3D97,
  0x3D98,
  0x3D98,
  0x3D98,
  0x3D98,
  0x3D98,
  0x3D99,
  0x3D99,
  0x3D99,
  0x3D99,
  0x3D99,
  0x3D9A,
  0x3D9A,
  0x3D9A,
  0x3D9A,
  0x3D9A,
  0x3D9B,
  0x3D9B,
  0x3D9B,
  0x3D9B,
  0x3D9B,
  0x3D9C,
  0x3D9C,
  0x3D9C,
  0x3D9C,
  0x3D9C,
  0x3D9D,
  0x3D9D,
  0x3D9D,
  0x3D9D,
  0x3D9D,
  0x3D9E,
  0x3D9E,
  0x3D9E,
  0x3D9E,
  0x3D9F,
  0x3D9F,
  0x3D9F,
  0x3D9F,
  0x3D9F,
  0x3DA0,
  0x3DA0,
  0x3DA0,
  0x3DA0,
  0x3DA0,
  0x3DA1,
  0x3DA1,
  0x3DA1,
  0x3DA1,
  0x3DA1,
  0x3DA2,
  0x3DA2,
  0x3DA2,
  0x3DA2,
  0x3DA2,
  0x3DA3,
  0x3DA3,
  0x3DA3,
  0x3DA3,
  0x3DA3,
  0x3DA4,
  0x3DA4,
  0x3DA4,
  0x3DA4,
  0x3DA4,
  0x3DA5,
  0x3DA5,
  0x3DA5,
  0x3DA5,
  0x3DA5,
  0x3DA6,
  0x3DA6,
  0x3DA6,
  0x3DA6,
  0x3DA7,
  0x3DA7,
  0x3DA7,
  0x3DA7,
  0x3DA7,
  0x3DA8,
  0x3DA8,
  0x3DA8,
  0x3DA8,
  0x3DA8,
  0x3DA9,
  0x3DA9,
  0x3DA9,
  0x3DA9,
  0x3DA9,
  0x3DAA,
  0x3DAA,
  0x3DAA,
  0x3DAA,
  0x3DAA,
  0x3DAB,
  0x3DAB,
  0x3DAB,
  0x3DAB,
  0x3DAB,
  0x3DAC,
  0x3DAC,
  0x3DAC,
  0x3DAC,
  0x3DAC,
  0x3DAD,
  0x3DAD,
  0x3DAD,
  0x3DAD,
  0x3DAD,
  0x3DAE,
  0x3DAE,
  0x3DAE,
  0x3DAE,
  0x3DAE,
  0x3DAF,
  0x3DAF,
  0x3DAF,
  0x3DAF,
  0x3DB0,
  0x3DB0,
  0x3DB0,
  0x3DB0,
  0x3DB0,
  0x3DB1,
  0x3DB1,
  0x3DB1,
  0x3DB1,
  0x3DB1,
  0x3DB2,
  0x3DB2,
  0x3DB2,
  0x3DB2,
  0x3DB2,
  0x3DB3,
  0x3DB3,
  0x3DB3,
  0x3DB3,
  0x3DB3,
  0x3DB4,
  0x3DB4,
  0x3DB4,
  0x3DB4,
  0x3DB4,
  0x3DB5,
  0x3DB5,
  0x3DB5,
  0x3DB5,
  0x3DB5,
  0x3DB6,
  0x3DB6,
  0x3DB6,
  0x3DB6,
  0x3DB6,
  0x3DB7,
  0x3DB7,
  0x3DB7,
  0x3DB7,
  0x3DB8,
  0x3DB8,
  0x3DB8,
  0x3DB8,
  0x3DB8,
  0x3DB9,
  0x3DB9,
  0x3DB9,
  0x3DB9,
  0x3DB9,
  0x3DBA,
  0x3DBA,
  0x3DBA,
  0x3DBA,
  0x3DBA,
  0x3DBB,
  0x3DBB,
  0x3DBB,
  0x3DBB,
  0x3DBB,
  0x3DBC,
  0x3DBC,
  0x3DBC,
  0x3DBC,
  0x3DBC,
  0x3DBD,
  0x3DBD,
  0x3DBD,
  0x3DBD,
  0x3DBD,
  0x3DBE,
  0x3DBE,
  0x3DBE,
  0x3DBE,
  0x3DBE,
  0x3DBF,
  0x3DBF,
  0x3DBF,
  0x3DBF,
  0x3DBF,
  0x3DC0,
  0x3DC0,
  0x3DC0,
  0x3DC0,
  0x3DC1,
  0x3DC1,
  0x3DC1,
  0x3DC1,
  0x3DC1,
  0x3DC2,
  0x3DC2,
  0x3DC2,
  0x3DC2,
  0x3DC2,
  0x3DC3,
  0x3DC3,
  0x3DC3,
  0x3DC3,
  0x3DC3,
  0x3DC4,
  0x3DC4,
  0x3DC4,
  0x3DC4,
  0x3DC4,
  0x3DC5,
  0x3DC5,
  0x3DC5,
  0x3DC5,
  0x3DC5,
  0x3DC6,
  0x3DC6,
  0x3DC6,
  0x3DC6,
  0x3DC6,
  0x3DC7,
  0x3DC7,
  0x3DC7,
  0x3DC7,
  0x3DC7,
  0x3DC8,
  0x3DC8,
  0x3DC8,
  0x3DC8,
  0x3DC8,
  0x3DC9,
  0x3DC9,
  0x3DC9,
  0x3DC9,
  0x3DCA,
  0x3DCA,
  0x3DCA,
  0x3DCA,
  0x3DCA,
  0x3DCB,
  0x3DCB,
  0x3DCB,
  0x3DCB,
  0x3DCB,
  0x3DCC,
  0x3DCC,
  0x3DCC,
  0x3DCC,
  0x3DCC,
  0x3DCD,
};

// Golden output words for PRE_DATA_COMPARE_FIX mode (1000 entries); verify()
// compares ofmap_data[i] against entry i of this table.
// NOTE(review): despite the "sigmode" name the values look like inverse square
// roots of test_pattern (e.g. input 0x3C00 = 2^-7 pairs with 0x4135 ~= 11.31),
// so the name appears to be a leftover from a sigmoid test -- confirm before
// renaming, since verify() refers to it by this name.
static u16 sigmode_golden_bf16[] = {
  0x4f00,
  0x42d0,
  0x4293,
  0x426f,
  0x4250,
  0x4234,
  0x4229,
  0x421e,
  0x4213,
  0x4208,
  0x41fe,
  0x41f7,
  0x41ef,
  0x41e7,
  0x41df,
  0x41d8,
  0x41d0,
  0x41c8,
  0x41c1,
  0x41b9,
  0x41b4,
  0x41b1,
  0x41ae,
  0x41ab,
  0x41a9,
  0x41a6,
  0x41a4,
  0x41a1,
  0x419e,
  0x419b,
  0x4198,
  0x4196,
  0x4193,
  0x4191,
  0x418e,
  0x418b,
  0x4188,
  0x4186,
  0x4183,
  0x4180,
  0x417e,
  0x417c,
  0x417a,
  0x4178,
  0x4177,
  0x4175,
  0x4173,
  0x4171,
  0x416f,
  0x416d,
  0x416b,
  0x4169,
  0x4167,
  0x4165,
  0x4163,
  0x4162,
  0x415f,
  0x415d,
  0x415c,
  0x415a,
  0x4158,
  0x4156,
  0x4154,
  0x4152,
  0x4150,
  0x414e,
  0x414c,
  0x414a,
  0x4148,
  0x4147,
  0x4145,
  0x4142,
  0x4141,
  0x413f,
  0x413d,
  0x413b,
  0x4139,
  0x4137,
  0x4135,
  0x4135,
  0x4134,
  0x4133,
  0x4133,
  0x4132,
  0x4131,
  0x4130,
  0x4130,
  0x412f,
  0x412e,
  0x412e,
  0x412d,
  0x412c,
  0x412b,
  0x412b,
  0x412a,
  0x4129,
  0x4129,
  0x4128,
  0x4127,
  0x4127,
  0x4126,
  0x4126,
  0x4125,
  0x4124,
  0x4124,
  0x4123,
  0x4122,
  0x4122,
  0x4121,
  0x4120,
  0x411f,
  0x411f,
  0x411e,
  0x411d,
  0x411d,
  0x411c,
  0x411b,
  0x411a,
  0x411a,
  0x4119,
  0x4118,
  0x4118,
  0x4117,
  0x4116,
  0x4116,
  0x4115,
  0x4115,
  0x4114,
  0x4113,
  0x4113,
  0x4112,
  0x4111,
  0x4111,
  0x4110,
  0x410f,
  0x410e,
  0x410e,
  0x410d,
  0x410c,
  0x410c,
  0x410b,
  0x410a,
  0x410a,
  0x4109,
  0x4108,
  0x4107,
  0x4107,
  0x4106,
  0x4106,
  0x4105,
  0x4104,
  0x4104,
  0x4103,
  0x4102,
  0x4102,
  0x4101,
  0x4100,
  0x40ff,
  0x40ff,
  0x40ff,
  0x40fe,
  0x40fe,
  0x40fd,
  0x40fc,
  0x40fc,
  0x40fc,
  0x40fb,
  0x40fb,
  0x40fa,
  0x40fa,
  0x40fa,
  0x40f9,
  0x40f8,
  0x40f8,
  0x40f7,
  0x40f7,
  0x40f7,
  0x40f6,
  0x40f5,
  0x40f5,
  0x40f5,
  0x40f4,
  0x40f4,
  0x40f3,
  0x40f3,
  0x40f2,
  0x40f2,
  0x40f1,
  0x40f1,
  0x40f0,
  0x40f0,
  0x40f0,
  0x40ef,
  0x40ee,
  0x40ee,
  0x40ed,
  0x40ed,
  0x40ed,
  0x40ec,
  0x40eb,
  0x40eb,
  0x40ea,
  0x40ea,
  0x40ea,
  0x40e9,
  0x40e9,
  0x40e8,
  0x40e7,
  0x40e7,
  0x40e7,
  0x40e6,
  0x40e6,
  0x40e5,
  0x40e5,
  0x40e4,
  0x40e4,
  0x40e3,
  0x40e3,
  0x40e2,
  0x40e2,
  0x40e2,
  0x40e1,
  0x40e0,
  0x40e0,
  0x40df,
  0x40df,
  0x40df,
  0x40de,
  0x40dd,
  0x40dd,
  0x40dd,
  0x40dc,
  0x40dc,
  0x40db,
  0x40da,
  0x40da,
  0x40da,
  0x40d9,
  0x40d9,
  0x40d8,
  0x40d8,
  0x40d8,
  0x40d7,
  0x40d6,
  0x40d6,
  0x40d5,
  0x40d5,
  0x40d5,
  0x40d4,
  0x40d3,
  0x40d3,
  0x40d2,
  0x40d2,
  0x40d2,
  0x40d1,
  0x40d1,
  0x40d0,
  0x40cf,
  0x40cf,
  0x40cf,
  0x40ce,
  0x40ce,
  0x40cd,
  0x40cd,
  0x40cc,
  0x40cc,
  0x40cb,
  0x40cb,
  0x40ca,
  0x40ca,
  0x40ca,
  0x40c9,
  0x40c8,
  0x40c8,
  0x40c8,
  0x40c7,
  0x40c7,
  0x40c6,
  0x40c5,
  0x40c5,
  0x40c5,
  0x40c4,
  0x40c4,
  0x40c3,
  0x40c2,
  0x40c2,
  0x40c2,
  0x40c1,
  0x40c1,
  0x40c0,
  0x40c0,
  0x40c0,
  0x40bf,
  0x40be,
  0x40be,
  0x40bd,
  0x40bd,
  0x40bd,
  0x40bc,
  0x40bb,
  0x40bb,
  0x40ba,
  0x40ba,
  0x40ba,
  0x40b9,
  0x40b9,
  0x40b8,
  0x40b8,
  0x40b7,
  0x40b7,
  0x40b6,
  0x40b6,
  0x40b5,
  0x40b5,
  0x40b5,
  0x40b5,
  0x40b5,
  0x40b4,
  0x40b4,
  0x40b4,
  0x40b4,
  0x40b4,
  0x40b3,
  0x40b3,
  0x40b3,
  0x40b3,
  0x40b3,
  0x40b3,
  0x40b3,
  0x40b2,
  0x40b2,
  0x40b2,
  0x40b2,
  0x40b2,
  0x40b1,
  0x40b1,
  0x40b1,
  0x40b1,
  0x40b1,
  0x40b0,
  0x40b0,
  0x40b0,
  0x40b0,
  0x40b0,
  0x40b0,
  0x40b0,
  0x40af,
  0x40af,
  0x40af,
  0x40af,
  0x40af,
  0x40ae,
  0x40ae,
  0x40ae,
  0x40ae,
  0x40ae,
  0x40ae,
  0x40ae,
  0x40ad,
  0x40ad,
  0x40ad,
  0x40ad,
  0x40ad,
  0x40ac,
  0x40ac,
  0x40ac,
  0x40ac,
  0x40ac,
  0x40ab,
  0x40ab,
  0x40ab,
  0x40ab,
  0x40ab,
  0x40ab,
  0x40ab,
  0x40aa,
  0x40aa,
  0x40aa,
  0x40aa,
  0x40aa,
  0x40a9,
  0x40a9,
  0x40a9,
  0x40a9,
  0x40a9,
  0x40a9,
  0x40a9,
  0x40a8,
  0x40a8,
  0x40a8,
  0x40a8,
  0x40a8,
  0x40a7,
  0x40a7,
  0x40a7,
  0x40a7,
  0x40a7,
  0x40a7,
  0x40a7,
  0x40a7,
  0x40a6,
  0x40a6,
  0x40a6,
  0x40a6,
  0x40a6,
  0x40a5,
  0x40a5,
  0x40a5,
  0x40a5,
  0x40a4,
  0x40a4,
  0x40a4,
  0x40a4,
  0x40a4,
  0x40a4,
  0x40a4,
  0x40a4,
  0x40a3,
  0x40a3,
  0x40a3,
  0x40a3,
  0x40a3,
  0x40a2,
  0x40a2,
  0x40a2,
  0x40a2,
  0x40a2,
  0x40a2,
  0x40a2,
  0x40a1,
  0x40a1,
  0x40a1,
  0x40a1,
  0x40a1,
  0x40a0,
  0x40a0,
  0x40a0,
  0x40a0,
  0x40a0,
  0x409f,
  0x409f,
  0x409f,
  0x409f,
  0x409f,
  0x409f,
  0x409f,
  0x409e,
  0x409e,
  0x409e,
  0x409e,
  0x409e,
  0x409d,
  0x409d,
  0x409d,
  0x409d,
  0x409d,
  0x409d,
  0x409d,
  0x409c,
  0x409c,
  0x409c,
  0x409c,
  0x409c,
  0x409b,
  0x409b,
  0x409b,
  0x409b,
  0x409b,
  0x409a,
  0x409a,
  0x409a,
  0x409a,
  0x409a,
  0x409a,
  0x409a,
  0x4099,
  0x4099,
  0x4099,
  0x4099,
  0x4099,
  0x4098,
  0x4098,
  0x4098,
  0x4098,
  0x4098,
  0x4098,
  0x4098,
  0x4098,
  0x4097,
  0x4097,
  0x4097,
  0x4097,
  0x4096,
  0x4096,
  0x4096,
  0x4096,
  0x4096,
  0x4096,
  0x4096,
  0x4096,
  0x4095,
  0x4095,
  0x4095,
  0x4095,
  0x4095,
  0x4094,
  0x4094,
  0x4094,
  0x4094,
  0x4094,
  0x4093,
  0x4093,
  0x4093,
  0x4093,
  0x4093,
  0x4093,
  0x4093,
  0x4092,
  0x4092,
  0x4092,
  0x4092,
  0x4092,
  0x4091,
  0x4091,
  0x4091,
  0x4091,
  0x4091,
  0x4091,
  0x4091,
  0x4090,
  0x4090,
  0x4090,
  0x4090,
  0x4090,
  0x408f,
  0x408f,
  0x408f,
  0x408f,
  0x408f,
  0x408e,
  0x408e,
  0x408e,
  0x408e,
  0x408e,
  0x408e,
  0x408e,
  0x408d,
  0x408d,
  0x408d,
  0x408d,
  0x408d,
  0x408c,
  0x408c,
  0x408c,
  0x408c,
  0x408c,
  0x408c,
  0x408c,
  0x408b,
  0x408b,
  0x408b,
  0x408b,
  0x408b,
  0x408a,
  0x408a,
  0x408a,
  0x408a,
  0x408a,
  0x408a,
  0x408a,
  0x408a,
  0x4089,
  0x4089,
  0x4089,
  0x4089,
  0x4088,
  0x4088,
  0x4088,
  0x4088,
  0x4088,
  0x4087,
  0x4087,
  0x4087,
  0x4087,
  0x4087,
  0x4087,
  0x4087,
  0x4087,
  0x4086,
  0x4086,
  0x4086,
  0x4086,
  0x4086,
  0x4085,
  0x4085,
  0x4085,
  0x4085,
  0x4085,
  0x4085,
  0x4085,
  0x4084,
  0x4084,
  0x4084,
  0x4084,
  0x4084,
  0x4083,
  0x4083,
  0x4083,
  0x4083,
  0x4083,
  0x4082,
  0x4082,
  0x4082,
  0x4082,
  0x4082,
  0x4082,
  0x4082,
  0x4081,
  0x4081,
  0x4081,
  0x4081,
  0x4081,
  0x4080,
  0x4080,
  0x4080,
  0x4080,
  0x4080,
  0x4080,
  0x407f,
  0x407f,
  0x407f,
  0x407f,
  0x407f,
  0x407f,
  0x407f,
  0x407f,
  0x407f,
  0x407f,
  0x407e,
  0x407e,
  0x407e,
  0x407e,
  0x407e,
  0x407e,
  0x407e,
  0x407e,
  0x407e,
  0x407d,
  0x407d,
  0x407d,
  0x407d,
  0x407d,
  0x407c,
  0x407c,
  0x407c,
  0x407c,
  0x407c,
  0x407c,
  0x407c,
  0x407c,
  0x407c,
  0x407c,
  0x407b,
  0x407b,
  0x407b,
  0x407b,
  0x407b,
  0x407b,
  0x407b,
  0x407b,
  0x407b,
  0x407b,
  0x407a,
  0x407a,
  0x407a,
  0x407a,
  0x407a,
  0x407a,
  0x407a,
  0x407a,
  0x407a,
  0x407a,
  0x4079,
  0x4079,
  0x4079,
  0x4079,
  0x4079,
  0x4078,
  0x4078,
  0x4078,
  0x4078,
  0x4078,
  0x4078,
  0x4078,
  0x4078,
  0x4078,
  0x4077,
  0x4077,
  0x4077,
  0x4077,
  0x4077,
  0x4077,
  0x4077,
  0x4077,
  0x4077,
  0x4077,
  0x4076,
  0x4076,
  0x4076,
  0x4076,
  0x4076,
  0x4075,
  0x4075,
  0x4075,
  0x4075,
  0x4075,
  0x4075,
  0x4075,
  0x4075,
  0x4075,
  0x4075,
  0x4074,
  0x4074,
  0x4074,
  0x4074,
  0x4074,
  0x4074,
  0x4074,
  0x4074,
  0x4074,
  0x4073,
  0x4073,
  0x4073,
  0x4073,
  0x4073,
  0x4073,
  0x4073,
  0x4073,
  0x4073,
  0x4073,
  0x4072,
  0x4072,
  0x4072,
  0x4072,
  0x4072,
  0x4071,
  0x4071,
  0x4071,
  0x4071,
  0x4071,
  0x4071,
  0x4071,
  0x4071,
  0x4071,
  0x4071,
  0x4070,
  0x4070,
  0x4070,
  0x4070,
  0x4070,
  0x4070,
  0x4070,
  0x4070,
  0x4070,
  0x4070,
  0x406f,
  0x406f,
  0x406f,
  0x406f,
  0x406f,
  0x406e,
  0x406e,
  0x406e,
  0x406e,
  0x406e,
  0x406e,
  0x406e,
  0x406e,
  0x406e,
  0x406d,
  0x406d,
  0x406d,
  0x406d,
  0x406d,
  0x406d,
  0x406d,
  0x406d,
  0x406d,
  0x406d,
  0x406c,
  0x406c,
  0x406c,
  0x406c,
  0x406c,
  0x406b,
  0x406b,
  0x406b,
  0x406b,
  0x406b,
  0x406b,
  0x406b,
  0x406b,
  0x406b,
  0x406b,
  0x406a,
  0x406a,
  0x406a,
  0x406a,
  0x406a,
  0x406a,
  0x406a,
  0x406a,
  0x406a,
  0x4069,
  0x4069,
  0x4069,
  0x4069,
  0x4069,
  0x4069,
  0x4069,
  0x4069,
  0x4069,
  0x4069,
  0x4068,
  0x4068,
  0x4068,
  0x4068,
  0x4068,
  0x4067,
  0x4067,
  0x4067,
  0x4067,
  0x4067,
  0x4067,
  0x4067,
  0x4067,
  0x4067,
  0x4067,
  0x4066,
  0x4066,
  0x4066,
  0x4066,
  0x4066,
  0x4066,
  0x4066,
  0x4066,
  0x4066,
  0x4066,
  0x4065,
  0x4065,
  0x4065,
  0x4065,
  0x4065,
  0x4064,
  0x4064,
  0x4064,
  0x4064,
  0x4064,
  0x4064,
  0x4064,
  0x4064,
  0x4064,
  0x4063,
  0x4063,
  0x4063,
  0x4063,
  0x4063,
  0x4063,
  0x4063,
  0x4063,
  0x4063,
  0x4063,
  0x4062,
  0x4062,
  0x4062,
  0x4062,
  0x4062,
  0x4062,
  0x4062,
  0x4062,
  0x4062,
  0x4062,
  0x4061,
  0x4061,
  0x4061,
  0x4061,
  0x4061,
  0x4060,
  0x4060,
  0x4060,
  0x4060,
  0x4060,
  0x4060,
  0x4060,
  0x4060,
  0x4060,
  0x405f,
  0x405f,
  0x405f,
  0x405f,
  0x405f,
  0x405f,
  0x405f,
  0x405f,
  0x405f,
  0x405f,
  0x405e,
  0x405e,
  0x405e,
  0x405e,
  0x405e,
  0x405d,
  0x405d,
  0x405d,
  0x405d,
  0x405d,
  0x405d,
  0x405d,
  0x405d,
  0x405d,
  0x405d,
  0x405c,
  0x405c,
  0x405c,
  0x405c,
  0x405c,
  0x405c,
  0x405c,
  0x405c,
  0x405c,
  0x405c,
  0x405b,
  0x405b,
  0x405b,
  0x405b,
  0x405b,
  0x405a,
  0x405a,
  0x405a,
  0x405a,
  0x405a,
  0x405a,
  0x405a,
  0x405a,
  0x405a,
  0x4059,
  0x4059,
  0x4059,
  0x4059,
  0x4059,
  0x4059,
  0x4059,
  0x4059,
  0x4059,
  0x4059,
  0x4058,
  0x4058,
  0x4058,
  0x4058,
  0x4058,
  0x4058,
  0x4058,
  0x4058,
  0x4058,
  0x4058,
  0x4057,
  0x4057,
  0x4057,
  0x4057,
  0x4057,
  0x4056,
  0x4056,
  0x4056,
  0x4056,
  0x4056,
  0x4056,
  0x4056,
  0x4056,
  0x4056,
  0x4056,
  0x4055,
  0x4055,
  0x4055,
  0x4055,
  0x4055,
  0x4055,
  0x4055,
  0x4055,
  0x4055,
  0x4054,
  0x4054,
  0x4054,
  0x4054,
  0x4054,
  0x4053,
  0x4053,
  0x4053,
  0x4053,
  0x4053,
  0x4053,
};

// Returns true when `input` lies strictly inside (-128, 128); otherwise prints
// a diagnostic to stdout and returns false. NaN is reported as out of range
// because both comparisons evaluate to false.
static bool check_input_int8_range(float input) {
  const bool in_range = (-128.0 < input) && (input < 128.0);
  if (in_range) {
    return true;
  }

  printf("invalid int8 range, input is %f\n", input);
  return false;
}

// <! gen invert sqrt
// Computes base^(-p/2), i.e. the inverse square root of base^p.
//
// A negative base with an odd p would hand pow() a non-integer exponent (which
// yields NaN), so one factor of `base` is hoisted out as a multiplier and p is
// moved one step towards zero, making p even:
//   (-2)^(-31) -> -2 * (-2^-30)
// Asserts if the result is still NaN.
static double _gen_sqrt_inv(int base, int p) {
  int multiplier = 1;

  if (base < 0 && (p % 2) != 0) {
    // p is odd here and therefore never zero.
    multiplier = base;
    p += (p > 0) ? -1 : 1;
  }

  // y = x ^ -0.5
  const double result = (double) (multiplier * pow(base, p * -0.5));
  assert(!isnan(result));
  return result;
}

// Host-side reference hook for the LUT test.
//
// In its current form this function only validates its arguments: the table
// must have n == 1 and h * w == 256, all pointers must be non-null and the
// input shape must be non-empty. It does NOT write to `ofmap`; the code that
// would fill it (by shelling out to `eval_lut.py`) is compiled out below.
static void tl_lut_ref(
    u16 *ofmap,
    u16 *ifmap,
    u16 *table,
    u16 *table_slope,
    tl_shape_t ifmap_shape,
    tl_shape_t table_shape)
{
  const int table_n = table_shape.n;
  const int table_rows = table_shape.h;
  const int table_cols = table_shape.w;

  assert(table_n == 1);
  assert(table_rows * table_cols == 256);
  assert(ofmap);
  assert(ifmap);
  assert(table);
  assert(table_slope);
  assert(tl_shape_size(&ifmap_shape));

  // TODO: use c function
  // TODO: cal error with `eval_lut.py`
#if 0
  // 1. dump all input as binary file
  #define INFP32FILE "inv_infp32file.bin"
  #define OUTBF16FILE "inv_lutbf16out.bin"
  FILE* pFile;
  pFile = fopen(INFP32FILE, "wb");
  fwrite(ifmap, 1, tl_shape_size(&ifmap_shape) *sizeof(u16), pFile);
  fclose(pFile);

  // 2. read result from `eval_lut.py`
  char command[256];
  // func_id 4 means invsqrt
  // lut_type_id 1 means exp
  sprintf(command, "python eval_lut.py --lut_input_range_start %d --lut_input_range_end %d --func_id 4 --lut_type_id 1 --inputfloat32 %s --outputbf16 %s 2>&1 > 2\n",
      exp_start, exp_end,
      INFP32FILE, OUTBF16FILE);

  // printf ("command is %s\n", command);
  system(command);

  pFile = fopen(OUTBF16FILE, "rb");
  if (!pFile) {
    fprintf(stderr, "open golden %s fail\n", OUTBF16FILE);
    exit(-1);
  }

  fread(ofmap, sizeof(u16), tl_shape_size(&ifmap_shape), pFile);
  fclose(pFile);
#endif

#if 0
  for (u64 i = 0; i < tl_shape_size(&ifmap_shape); i++) {
    printf ("ref %" PRIu64 " input %x golden %x\n", i, ifmap[i], ofmap[i]);
  }
#endif
}

// Builds the inverse-square-root value table (y = x ^ -0.5).
//
// table_data: output buffer of `table_size` u16 entries; channel 0 holds the
//             table_hw-entry table, which is then copied to every other channel.
// table_size: total number of entries; table_size / channel / 2 must be 128.
//
// Every value is also stored in double precision in the file-level `sqrt_hw`
// buffer, which gen_sqrt_inv_slope() reads afterwards.
//
// Layout actually produced for channel 0:
//   idx 0         : 2 ^ exp_start
//   idx 1 .. 128  : 2 ^ e for e = exp_start .. exp_start + 127
//   idx 129       : 2 ^ 0
//   idx 130 .. 255: (-2) ^ e for e = exp_start + 1 .. exp_start + 126
//
// The previous version ran the last loop one step further and wrote a 257th
// entry, overflowing `sqrt_hw` (table_hw doubles). Entries 0..255 are unchanged.
//
// NOTE(review): older comments here described idx 127 and idx 255 as
// "don't care" slots, and gen_sqrt_inv_slope() special-cases idx 127/128/255,
// which does not line up with the layout above -- confirm the intended
// placement of the negative half against the hardware spec.
static void gen_sqrt_inv(u16 *table_data, u64 table_size) {
  //<! 32*8 table, duplicate `channel` times;
  assert(table_size);
  int half = table_size / channel / 2;
  assert(half == 128);

  u64 idx = 0;
  double s;

  // prepare channel 0
  s = _gen_sqrt_inv(2, exp_start);
  sqrt_hw[idx] = s;
  table_data[idx] = convert_fp32_bf16(s);
  idx++;

  // positive base: exponents exp_start, exp_start + 1, ...
  for (int i = 0; i < half; i++) {
    s = _gen_sqrt_inv(2, exp_start + i);
    sqrt_hw[idx] = s;
    table_data[idx] = convert_fp32_bf16(s);
    idx++;
  }

  // separator entry between the two halves: 2 ^ 0
  s = _gen_sqrt_inv(2, 0);
  sqrt_hw[idx] = s;
  table_data[idx] = convert_fp32_bf16(s);
  idx++;

  // negative base; stop at half - 1 so the last write lands on
  // idx == table_hw - 1 instead of one past the end of sqrt_hw
  for (int i = 1; i < half - 1; i++) {
    s = _gen_sqrt_inv(-2, exp_start + i);
    sqrt_hw[idx] = s;
    table_data[idx] = convert_fp32_bf16(s);
    idx++;
  }

  // exactly one channel worth of entries must have been produced
  assert(idx == table_hw);

  // duplicate channel #1 to #31
  //TODO: tensor copy
  for (u32 i = 1; i < channel; i++) {
    memcpy(&table_data[i * table_hw], &table_data[0], sizeof(u16) * table_hw);
  }
}

// Builds the slope table that accompanies the value table from gen_sqrt_inv().
//
// table_data : value table; only checked for non-null here, the values
//              themselves are read from the double-precision `sqrt_hw` buffer,
//              so gen_sqrt_inv() must have run first.
// table_slope: output buffer of `table_size` u16 entries; channel 0 is filled
//              and then copied to every other channel.
// table_size : total number of entries; table_size / channel / 2 must be 128.
//
// For entry idx the slope is (f(x1) - f(x0)) / (x1 - x0), where f(x0) and
// f(x1) are sqrt_hw[idx] and sqrt_hw[idx + 1], and x0/x1 are consecutive
// powers of +2 (idx < 128) or -2 (idx >= 128). Entries 0 and 128 use x0 = 0.
// Entries 127 and 255 are written as 0 and marked "not used".
//
// The previous version iterated table_hw times starting from idx 1, so it read
// sqrt_hw[256] / sqrt_hw[257] and wrote table_slope[256]. The loop now stops
// at idx == table_hw - 1 and reads sqrt_hw only for entries that are used;
// entries 0..255 are unchanged.
static void gen_sqrt_inv_slope(u16 IN *table_data, u16* OUT table_slope, u64 table_size) {

  u32 half = table_size / channel / 2;
  assert(half == 128);
  assert(table_data);

  int idx = 0;

  // idx 0: segment from x = 0 up to x = 2 ^ exp_start
  {
    double f_x0 = sqrt_hw[0];
    double f_x1 = sqrt_hw[1];
    double x0 = 0;
    double x1 = pow(2.0, exp_start);
    double s = (f_x1 - f_x0) / (x1 - x0);
    table_slope[idx] = convert_fp32_bf16(s);
  }
  idx++;

  // idx 1 .. table_hw - 1 (idx always equals i + 1)
  for (u32 i = 0; i + 1 < table_hw; i++, idx++) {
    if (idx == 127 || idx >= 255) {
      double s = 0.0;
      table_slope[idx] = convert_fp32_bf16(s); // not used
      continue;
    }

    // the upper half of the table uses base -2 and restarts the exponent
    int shift = 0;
    int sign = 1;
    if (idx >= 128) {
      shift = 128;
      sign = -1;
    }

    double exponent = exp_start + (double)i - (double)shift;
    double x0 = pow(sign * 2.0, exponent);
    double x1 = pow(sign * 2.0, exponent + 1);
    if (idx == 128) {
      x0 = 0;
      exponent = exp_start; //<! keeps the range assert below satisfied
    }

    assert (!isinf(x0) && !isinf(x1));
    assert(exponent >= exp_start && exponent <= exp_end);

    double f_x0 = sqrt_hw[idx];
    double f_x1 = sqrt_hw[idx+1];
    double s = (f_x1 - f_x0) / (x1 - x0);
    table_slope[idx] = convert_fp32_bf16(s);
  }

  // duplicate channel #1 to #31
  //TODO: tensor copy
  for (u64 i = 1; i < channel; i++) {
    memcpy(&table_slope[table_hw * i], &table_slope[0], sizeof(u16) * table_hw);
  }
}

// Compares the device output against the expected values, element by element.
//
// ofmap_data: output words read back from the device.
// ref_data  : host reference; used only when mode != PRE_DATA_COMPARE_FIX.
// ofmap_size: number of valid entries in ofmap_data (and ref_data).
//
// In PRE_DATA_COMPARE_FIX mode the expected values come from
// sigmode_golden_bf16 and only as many entries as that table holds are
// compared. On the first mismatch a message is printed to stderr and the
// process exits with status -1; otherwise true is returned.
static bool verify(u16 *ofmap_data, u16 *ref_data, u64 ofmap_size) {
  const bool use_golden = (mode == PRE_DATA_COMPARE_FIX);

  u64 size = ofmap_size;
  if (use_golden) {
    size = sizeof(sigmode_golden_bf16) / sizeof(sigmode_golden_bf16[0]);
    // the golden table must not be longer than the output being checked
    assert(size <= ofmap_size);
  }

  for (u64 i = 0; i < size; i++) {
    // read ref_data only when it is the reference in use; in fixed mode the
    // caller's buffer may never have been filled
    const u16 ref = use_golden ? sigmode_golden_bf16[i] : ref_data[i];

    if (ofmap_data[i] != ref) {
      // report the value that was actually compared against
      fprintf(stderr,
          "comparing failed at ofmap_data[%" PRIu64 "], got %x, exp %x\n",
          i, ofmap_data[i], ref);
      exit(-1);
    }
  }
  return true;
}

// Runs one inverse-sqrt LUT test in the current `mode` and verifies the result.
//
// Steps:
//   1. build the bf16 input (fixed test_pattern, or a generated ramp),
//   2. build the value table and the slope table on the host,
//   3. load input and both tables into local memory,
//   4. on the device: extract base x0 and table index from the input, look up
//      f(x0) and the slope, compute (x - x0), then f(x0) + (x - x0) * slope,
//   5. read the result back and compare it with verify().
//
// Changes from the previous version: the unused `ifmap_slope` buffer is gone,
// the copy of test_pattern is bounds-checked, and `ref_data` is zeroed because
// tl_lut_ref() does not currently write it.
static void test_tl_int8_lut_bf16(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk)
{
  // TODO: check more shape / align
  tl_shape_t ifmap_shape;
  if (mode == PRE_DATA_COMPARE_FIX) {
    ifmap_shape = {1, channel, 8, 8};
  }
  else {
    ifmap_shape = {1, channel, 16, 16};
  }

  tl_shape_t table_shape = {1, channel, table_h, table_w}; // hard code for hw, hw:32x8
  tl_shape_t ofmap_shape = ifmap_shape;

  u64 ifmap_size = tl_shape_size(&ifmap_shape);
  u64 table_size = tl_shape_size(&table_shape);
  u64 ofmap_size = tl_shape_size(&ofmap_shape);

  fmt_t fmt = FMT_BF16;

  int data_type_size = bytesize_of_fmt(fmt);
  u64 ifmap_bytesize  =  ifmap_size * data_type_size;
  u64 table_bytesize  =  table_size * data_type_size;
  u64 ofmap_bytesize  =  ofmap_size * data_type_size;

  // hw ONLY support index in int8
  u16 *ifmap = (u16 *)xmalloc(ifmap_bytesize);
  memset(ifmap, 0x00, ifmap_bytesize);

  if (mode == PRE_DATA_COMPARE_FIX) {
    // the fixed pattern fills only the front of the tensor; the rest stays 0
    assert(sizeof(test_pattern) <= ifmap_bytesize);
    memcpy(ifmap, test_pattern, sizeof(test_pattern));
  }
  else {
    for (u64 i = 0; i < ifmap_size; i++) {
      // input range 0.001 - 32
      float input = ((int)i % 31) + (i % 100) * 0.012;
      assert(check_input_int8_range(input));
      ifmap[i] = convert_fp32_bf16(input);
    }
  }

  // gen_sqrt_inv() must run before gen_sqrt_inv_slope(): the slope table is
  // derived from the double-precision values the former leaves in sqrt_hw
  u16 *table_data = (u16 *)xmalloc(table_bytesize);
  gen_sqrt_inv (table_data, table_size);

  u16 *table_data_slope = (u16 *)xmalloc(table_bytesize);
  gen_sqrt_inv_slope(table_data, table_data_slope, table_size);

  // tl_lut_ref() currently only validates its arguments, so start from a
  // defined (all-zero) reference buffer
  u16 *ref_data = (u16 *)xmalloc(ofmap_bytesize);
  memset(ref_data, 0x00, ofmap_bytesize);
  tl_lut_ref(ref_data, ifmap, table_data, table_data_slope, ifmap_shape, table_shape);

  tl_t *tl_ifmap =
    alloc_tl(bmk,ifmap_shape, fmt, /*align*/1);
  tl_t *tl_table_answer =
    alloc_tl(bmk, table_shape, fmt, /*align*/1);
  tl_t *tl_table_answer_slope =
    alloc_tl(bmk, table_shape, fmt, /*align*/1);

  tl_t *tl_ofmap_A_idx =
    alloc_tl(bmk,ofmap_shape, fmt, /*align*/1);
  tl_t *tl_ofmap_B_slope =
    alloc_tl(bmk,ofmap_shape, fmt, /*align*/1);
  tl_t *tl_ofmap_A_base_val =
    alloc_tl(bmk,ofmap_shape, fmt, /*align*/1);
  tl_t *tl_ofmap_A_base =
    alloc_tl(bmk,ofmap_shape, fmt, /*align*/1);
  tl_t *tl_ofmap_C =
    alloc_tl(bmk,ofmap_shape, fmt, /*align*/1);

  // <! FIXME: prepare it
  bmk1880v2_tdma_tg2l_tensor_copy_param_t copy_p1, copy_p2, copy_p3;
  memset(&copy_p1, 0, sizeof(copy_p1));
  memset(&copy_p2, 0, sizeof(copy_p2));
  memset(&copy_p3, 0, sizeof(copy_p3));
  prepare_put_bf16_tensor_g2l(ctx, bmk, tl_ifmap, ifmap, fmt, &copy_p1);
  prepare_put_bf16_tensor_g2l(ctx, bmk, tl_table_answer, table_data, fmt, &copy_p2);
  prepare_put_bf16_tensor_g2l(ctx, bmk, tl_table_answer_slope, table_data_slope, fmt, &copy_p3);

  launch_put_bf16_tensor_g2l(ctx, bmk, copy_p1.src, &copy_p1); // input
  launch_put_bf16_tensor_g2l(ctx, bmk, copy_p2.src, &copy_p2); // table value
  launch_put_bf16_tensor_g2l(ctx, bmk, copy_p3.src, &copy_p3); // table slope

  // <! get base (x0)
  bmk1880v2_tdma_l2l_tensor_copy_param_t p10;
  memset(&p10, 0x00, sizeof(p10));
  p10.dst = tl_ofmap_A_base;
  p10.src = tl_ifmap;
  p10.mv_lut_base = true;
  bmk1880v2_tdma_l2l_bf16_tensor_copy(bmk, &p10);
  test_submit(ctx);

  // <! get index(pow)
  memset(&p10, 0x00, sizeof(bmk1880v2_tdma_l2l_tensor_copy_param_t));
  p10.dst = tl_ofmap_A_idx;
  p10.src = tl_ifmap;
  p10.mv_lut_idx = true;
  bmk1880v2_tdma_l2l_bf16_tensor_copy(bmk, &p10);
  test_submit(ctx);

  // <! get f(x0)
  bmk1880v2_tiu_lookup_table_param_t p12;
  memset(&p12, 0, sizeof(p12));
  p12.ofmap = tl_ofmap_A_base_val;
  p12.ifmap = tl_ofmap_A_idx;
  p12.table = tl_table_answer;
  bmk1880v2_tiu_lookup_table(bmk, &p12);

  // <! get slope by index
  // <! ( (f(x1) - f(x0)) / (x1 - x0) )
  memset(&p12, 0x0, sizeof(bmk1880v2_tiu_lookup_table_param_t));
  p12.ofmap = tl_ofmap_B_slope;
  p12.ifmap = tl_ofmap_A_idx;
  p12.table = tl_table_answer_slope;
  bmk1880v2_tiu_lookup_table(bmk, &p12);

  // <! sub, diff base , a - b
  // (x - x0)
  bmk1880v2_tiu_element_wise_sub_param_t p5;
  memset(&p5, 0, sizeof(p5));
  p5.res_high = 0;
  p5.res_low = tl_ofmap_C;
  p5.a_high = 0;
  p5.a_low = tl_ifmap;
  p5.b_high = 0;
  p5.b_low = tl_ofmap_A_base;
  p5.rshift_bits = 0;
  bmk1880v2_tiu_element_wise_sub(bmk, &p5);

  // <! mac
  // <! part A + part B, a * b + res = res
  bmk1880v2_tiu_element_wise_mac_param_t p2;
  memset(&p2, 0, sizeof(p2));
  p2.res_high = 0;
  p2.res_low = tl_ofmap_A_base_val;
  p2.res_is_int8 = 0;
  p2.a = tl_ofmap_C;
  p2.b_is_const = 0;
  p2.b = tl_ofmap_B_slope;
  p2.lshift_bits = 0;//lshift_bits;
  p2.rshift_bits = 0;//rshift_bits;
  p2.relu_enable = 0;
  bmk1880v2_tiu_element_wise_mac(bmk, &p2);
  test_submit(ctx);

  // the accumulated result lives in tl_ofmap_A_base_val
  u16 *ofmap_data = (u16*)get_bf16_tensor_l2g(ctx, bmk, tl_ofmap_A_base_val, fmt);
  verify(ofmap_data, ref_data, ofmap_size);

  // release local-memory tensors in reverse order of allocation
  free_tl(bmk, tl_ofmap_C);
  free_tl(bmk, tl_ofmap_A_base);
  free_tl(bmk, tl_ofmap_A_base_val);
  free_tl(bmk, tl_ofmap_B_slope);
  free_tl(bmk, tl_ofmap_A_idx);
  free_tl(bmk, tl_table_answer_slope);
  free_tl(bmk, tl_table_answer);
  free_tl(bmk, tl_ifmap);

  free(ifmap);
  free(table_data);
  free(table_data_slope);
  free(ref_data);
  free(ofmap_data);
}

// Test entry point: initialises the runtime, runs the LUT test once for every
// TEST_MODE value below TEST_MODE_MAX, then tears the runtime down.
int main()
{
  CVI_RT_HANDLE ctx;
  bmk_ctx_t *bmk;

  // presumably saves the current floating-point rounding mode and installs the
  // one the test needs; the saved value is handed back to restore_feround()
  const int saved_round_mode = set_store_feround();

  test_init(&ctx, &bmk);

  int mode_index = PRE_DATA_COMPARE_FIX;
  while (mode_index < TEST_MODE_MAX) {
    mode = static_cast<TEST_MODE>(mode_index);
    printf ("test mode %d...\n", mode);
    test_tl_int8_lut_bf16(&ctx, bmk);
    mode_index++;
  }

  test_exit(&ctx);
  restore_feround(saved_round_mode);
  return 0;
}
